Reproducible analysis of the Indian dataset (IND)

library(knitr)
library(genio)
library(ggtree)
library(popkin)
library(BEDMatrix)
library(seriation)
library(superadmixture)
library(tidyverse)

# for plotting
library(ggplot2)
library(grid)
library(gridGraphics)
library(ggplotify)
library(ggpubr)
library(cowplot)
library(patchwork)
library(latex2exp)

1 Data Preprocessing

In this document, we demonstrate our procedures for analysis of merged dataset of the mainland Indians from the IND study with the Central/South Asia and the East Asia populations from HGDP to study. We obtained the assembled genotypes India367.{bed,bim,fam} of IND study from Dr. Analabha Basu Basu et al.. The raw HGDP is available at the webpage, attributed to Bergström et al. (2020). Here we start with the assembled version of HGDP, provided by the PLINK. PLINK hosts the assembled data at the following dropbox link:

The associated annotations can be found at the FTP site: ftp://ngs.sanger.ac.uk/production/hgdp/hgdp_wgs.20190516/metadata/hgdp_wgs.20190516.metadata.txt

We use PLINK, PLINK2 and zstd for data preprocessing. The PLINK software can be downloaded from this webpage. The PLINK2 software can be downloaded from this webpage. The ztsd is hosted at the github, and can downloaded and installed through the following commands.

git clone https://github.com/facebook/zstd.git
cd zstd
make install

We download and unzip the assembled HGDP data from the dropbox stated above. We create a folder called rawdata to host the downloaded raw data and pre-processed data.

mkdir -p rawdata && cd $_

zstd=<PATH_TO_ZSTD_EXECUTABLE>

## download `.pgen` file
wget "https://www.dropbox.com/s/hppj1g1gzygcocq/hgdp_all.pgen.zst?dl=1"
mv "hgdp_all.pgen.zst?dl=1" "hgdp_all.pgen.zst"
./$zstd -d "hgdp_all.pgen.zst"
rm "hgdp_all.pgen.zst"

## download `.pvar` file
wget "https://www.dropbox.com/s/1mmkq0bd9ax8rng/hgdp_all.pvar.zst?dl=1"
mv "hgdp_all.pvar.zst?dl=1" "hgdp_all.pvar.zst"

## download `.psam` file
wget "https://www.dropbox.com/s/0zg57558fqpj3w1/hgdp.psam?dl=1"
mv "hgdp.psam?dl=1" "hgdp_all.psam"

## download metadata
 wget "ftp://ngs.sanger.ac.uk/production/hgdp/hgdp_wgs.20190516/metadata/hgdp_wgs.20190516.metadata.txt"

This step serves set missing IDs to unique values to avoid these being detected as repeated IDs

PLINK2=<PATH_TO_PLINK2_EXECUTABLE>

./$PLINK2 --pfile hgdp_all vzs \
          --set-missing-var-ids '@:#' \
          --make-just-pvar zs \
          --out hgdp_all_uniq
          
# replace data
mv hgdp_all_uniq.pvar.zst hgdp_all.pvar.zst

# trash
rm hgdp_all_uniq.log

This step serves to convert files from zvs format to PLINK binary format.

./$PLINK2 \
    --pfile hgdp_all vzs \
    --var-filter \
    --snps-only just-acgt \
    --max-alleles 2 \
    --make-bed \
    --out hgdp_wgs_autosomes

Then we add subpopulation labels to FAM files.

awk -F" " '{
  if (NR == FNR) {
    population[$1]=$6;
    if ($10 == "M") {
      gender[$1]=1;
    } else if ($10 == "F") {
      gender[$1]=2;
    } else {
      gender[$1]=0;
    }
  } 
  else print population[$2],$2,$3,$4,gender[$2],$6;
}' hgdp_wgs.20190516.metadata.txt hgdp_wgs_autosomes.fam > hgdp_wgs_autosomes.fam.NEW
mv hgdp_wgs_autosomes.fam.NEW hgdp_wgs_autosomes.fam

This step serves to combine India367.{bed,bim,fam} with hgdp_wgs_autosomes.{bed,bim,fam}.

PLINK=<PATH_TO_PLINK_EXECUTABLE>
# get the SNP id
awk '{print $2}' India367.bim > India367_snps.txt

# extract the portion of genetic data that overlap with India367
./$PLINK --bfile hgdp_wgs_autosomes \
         --extract India367_snps.txt \
         --out hgdp_overlap \
         --allow-extra-chr \
         --make-bed \
         --geno 0.01 \
         --mind 0.01 \
         --maf 0.01
    
awk '{print $2}' hgdp_overlap.bim > hgdp_snps.txt
./$PLINK --bfile India367 \
         --extract hgdp_snps.txt \
         --out India367_overlap \
         --allow-extra-chr \
        --make-bed \
        --geno 0.01 \
        --mind 0.01 \
        --maf 0.01  
    
# merge two bfiles
./$PLINK --bfile hgdp_overlap \
      --bmerge India367_overlap \
      --make-bed \
      --out India367_final 
./$PLINK --bfile hgdp_overlap --exclude India367_final-merge.missnp --make-bed --out hgdp_overlap-tmp
./$PLINK --bfile India367_overlap --exclude India367_final-merge.missnp --make-bed --out India367_overlap-tmp
./$PLINK --bfile hgdp_overlap-tmp \
    --bmerge India367_overlap-tmp \
    --make-bed \
    --geno 0.01 \
    --maf 0.01 \
    --out India367_final

This code chunk cleans up the rawdata folder and removed the temporary files.

rm India367_snps.txt
rm India367_final.{log,nosex}
rm India367_final-merge.missnp
rm India367_overlap-tmp.{bed,bim,fam,nosex,log}
rm India367_overlap.{bed,bim,fam,nosex,log,irem}

rm hgdp_snps.txt
rm hgdp_overlap-tmp.{bed,bim,fam,log}
rm hgdp_overlap.{bed,bim,fam,log,irem}

2 Estimating individual-level coancestry

In the following sections, the intermediate data will be stored at the rdata folder.

X    <- BEDMatrix::BEDMatrix("rawdata/India367_final", simple_names = TRUE)
fam  <- genio::read_fam("rawdata/India367_final")
info <- readr::read_tsv( "rawdata/hgdp_wgs.20190516.metadata.txt", col_types = 'ccccccddccddddd') %>%
  dplyr::select(c('sample', 'population', 'latitude', 'longitude', 'region'))

This code chunk cleans up the population annotation files.

fam <- dplyr::left_join(fam, info, by = c("id" = "sample")) %>%
       dplyr::rename("subpop" = "region") %>%
       dplyr::mutate(subpop = ifelse(subpop == "AFRICA", "Africa",
                              ifelse(subpop == "MIDDLE_EAST", "MiddleEast",
                              ifelse(subpop == "EUROPE", "Europe",
                              ifelse(subpop == "CENTRAL_SOUTH_ASIA", "CSAsia",
                              ifelse(subpop == "EAST_ASIA", "EAsia",
                              ifelse(subpop == "AMERICA", "Americas", "Oceania")))))))

# label population for Indians
fam$population <- sapply(seq_along(fam$population), function(index) {
  population <- fam$population[index]
  if (is.na(population)) {
    id <- fam$id[index]
    return(unlist(stringr::str_split(id, "-|_"))[1])
  } else {
    return(population)
  } 
})

fam <- fam %>% dplyr::mutate(population = ifelse(population == "MT",  "MRT", population)) %>%
               dplyr::mutate(population = ifelse(population == "BR2", "WBR", population)) %>%
               dplyr::mutate(population = ifelse(population == "GD",  "GND", population)) %>%
               dplyr::mutate(population = ifelse(population == "IR",  "IYR", population)) %>%
               dplyr::mutate(population = ifelse(population == "IL",  "IRL", population)) %>%
               dplyr::mutate(population = ifelse(population == "JW",  "JRW", population)) %>%
               dplyr::mutate(population = ifelse(population == "PY",  "PNY", population)) %>%
               dplyr::mutate(population = ifelse(population == "K",   "KSH", population)) %>%
               dplyr::mutate(population = ifelse(population == "KO",  "KOR", population)) %>%
               dplyr::mutate(population = ifelse(population == "TH",  "THR", population)) %>%
               dplyr::mutate(population = ifelse(population == "SA",  "SAN", population)) %>%
               dplyr::mutate(population = ifelse(population == "KA",  "KDR", population)) %>%
               dplyr::mutate(population = ifelse(population == "PL1", "PLN", population)) %>%
               dplyr::mutate(population = ifelse(population == "MP",  "MPB", population))


# label subpop for Indians
CastePopoluations <- c("KSH", "GBR", "WBR", "MRT", "IYR", "PLN")
Dravidian         <- c("KDR", "IRL", "PNY")
Austro            <- c("GND", "HO", "SAN", "KOR", "BIR")
TibetoBurman      <- c("MPB", "THR", "TRI", "JAM")
JarwaOnge         <- c("JRW", "ONG")

fam <- dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% CastePopoluations, "IE", subpop)) %>%
       dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% Dravidian, "Dravidian", subpop)) %>%
       dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% Austro, "AA", subpop)) %>%
       dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% TibetoBurman, "TB", subpop)) %>%
       dplyr::mutate(fam, subpop = ifelse(is.na(subpop) & population %in% JarwaOnge, "J&O", subpop)) 

# retain only CSAsia, IE, Dravidian, AA, TB and EAsia
subpop_order <- c('CSAsia', 'IE', 'Dravidian', 'AA', 'TB', 'EAsia')
index <- fam$subpop %in% subpop_order
fam   <- fam[index, ]
X     <- X[index, ]

# reorder individuals
index <- order( match(fam$subpop, subpop_order) )
fam   <- fam[index, ]
X     <- X[index, ]

This code chunk performs a relatively primitive quality control and remove variants according to the following filters:

  1. We obtain the subset of individuals from HGDP and the subset of the individuals from Basu
  2. We remove the variants that the difference between af_hgdp and af_basu is greater than 0.2. This threshold is intended to tackle with the allele flipping issue arised from merging dataset.
  3. We also remove variants that the missing_hgdp or missing_basu is greater than 0.005. This step is intended to only keep variants that have a fair representation from both dataset.
X_hgdp  <- X[ stringr::str_starts(fam$id, "HGDP"), ]
X_basu  <- X[!stringr::str_starts(fam$id, "HGDP"), ]
af_hgdp <- colMeans(X_hgdp, na.rm = TRUE) / 2
af_basu <- colMeans(X_basu, na.rm = TRUE) / 2
idx1    <- abs(af_basu - af_hgdp) < 0.2
idx2    <- (colMeans(is.na(X_hgdp)) < 0.005) & (colMeans(is.na(X_basu)) < 0.005)

X <- X[, idx1 & idx2]

Here we demonstrate how to estimate the individual-level coancestry \(\boldsymbol{\Theta}\) according to the Ochoa-Storey (OS) method by popkin package. So here we only presents the codes but not running them. We provide the pre-computed coancestry at rdata/coanc_os.rda. It should noted that the popkin function returns the kinship coefficients instead of the coancestry coefficients. Therefore, we use the inbr_diag function in the popkin package to map kinship coefficients \(\phi_{jk}\)’s to coancestry coefficients \(\theta_{jk}\)’s:

\[ \theta_{jk} = \begin{cases} 2\phi_{jk} - 1 & j = k \\ \phi_{jk} &j \neq k \end{cases}. \]

obj     <- popkin::popkin_A(t(X))
A_min   <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_os <- 1 - obj$A / A_min
coanc_os    <- popkin::inbr_diag(kinship_os)
coanc_os  <- ifelse(coanc_os < 0, 0, coanc_os)

save(coanc_os,   file = "rdata/coanc_os.rda")
save(fam,        file = "rdata/fam.rda")
save(X,          file = "rdata/X.rda")

We can visualize the individual-level coancestry of the simulated data using plot_popkin function in the popkin function. We use the following helper function plot_colors_subpops to label the sub-populations.

plot_colors_subpops <- function(pops, srt = 0, cex = 0.6, y = FALSE) {
  n <- length(pops)
  k <- unique(pops)
  pops <- factor(pops, levels = unique(pops))
  xintercept <- cumsum(table(pops))
  breaks <- xintercept - 0.5 * as.numeric(table(pops))
  if (y) {
    plot(NULL, xlim = c(0, 1), ylim = c(1, n), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
    text(1, n-rev(breaks), rev(unique(pops)), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
  } else {
    plot(NULL, xlim = c(1, n), ylim = c(0, 1), axes = FALSE, ann = FALSE, xaxs = "i", yaxs = "i")
    text(breaks, 1, unique(pops), cex = cex, srt = srt, xpd = TRUE, adj = c(1, 0.5))
  }
}
load("rdata/coanc_os.rda")
load("rdata/fam.rda")

par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(c(3, 1, 2), c(3, 1, 5), c(0, 4, 0)), widths = c(0.2, 1, 0.2), heights = c(0.5, 0.5, 0.3))

popkin::plot_popkin(kinship = coanc_os, layout_add = FALSE, leg_cex = 0.8, labs_text = FALSE, labs_lwd = 0.1, labs = fam$subpop, ylab = '', leg_title = "Coancestry")

par(mar = c(0.2, 0, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 0.8)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)
par(mar = c(0, 0.2, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 0.8)

3 Estimating admixture proportions

The following chunk of the codes estimates the admixture proportions \(\boldsymbol{Q}\) from genotypes. We first estimate the individual specific allele frequencies \(\boldsymbol{\Pi}\) using the est_p_indiv function in the superadmixture package. We then estimate \(\boldsymbol{Q}\) by decomposing \(\boldsymbol{\Pi}\) with factor_p_indiv function in the superadmixture package. It takes hours to run the following codes, so we’re not running here.

for (k_antepops in 2:15) {
    # estimate individual-specific allele frequencies
    obj <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE)
    p_indiv <- obj$p_indiv
    rowspace <- obj$rowspace
    
    # estimate admix_props by decomposing individual-specific allele frequencies
    obj <- factor_p_indiv(p_indiv = p_indiv,
                          rowspace = rowspace,
                          k_antepops = k_antepops,
                          init_method = "rowspace",
                          verbose = TRUE,
                          max_iters = 1000,
                          tol = 1e-4)
    
    # save admixture proportions 
    Q_hat <- obj$Q_hat
    dir.create(file.path("rdata", "Q_hat"), showWarnings = FALSE)
    save(Q_hat, file = paste0("rdata/Q_hat/", k_antepops, ".rda"))
}

4 Selecting number of antecedent populations

The following chunk of the codes calculates the p-values of structured Hardy-Weinberg Equilibrium test (sHWE) and plots the relationship between the negative entropy of the sHWE p-values and \(K\). It takes hours to run the following codes, so we’re not running here.

for (k_antepops in 2:15) {
    # estimate `rowspace`, an input for sHWE function
   rowspace <- est_p_indiv(X, k_antepops, loci_on_cols = TRUE, rowspace_only = TRUE)
   
   # perform sHWE test
   pvals <- sHWE(X, k_antepops, loci_on_cols = TRUE, rowspace = rowspace) 
   save(pvals, file = paste0("rdata/sHWE/", k_antepops, ".rda"))
}

neg_entropy <- c()
for (k_antepops in 2:15) {
    load(paste0("rdata/sHWE/", k_antepops, ".rda"))
    neg_entropy <- c(neg_entropy, calc_neg_entropy(pvals))
}
save(neg_entropy, file = "rdata/neg_entropy.rda")

We visualize the relationship between \(K\) and the negative entropy of sHWE p-values.

load("rdata/neg_entropy.rda")
plot(2:15, neg_entropy, ylim = c(-2.2, -2.14), xlab = "", ylab = "", type = "b", family="Times New Roman")
mtext(latex2exp::TeX(r"(\textit{$K$})"), side = 1, col = "black", line = 2.5, family="Times New Roman", pch = 10)
mtext("Negative Entropy", side = 2, line = 2.5, col = "black",  family="Times New Roman")

We select \(K = 7\) according to this plot.

5 Estimating antecedent populations coancestry

After obtaining individual-level coancestry \(\boldsymbol{\Theta}\) and admixture proportions \(\boldsymbol{Q}\), we can use the function est_coanc to estimate population coancestry under the super admixture and standard admixture.

load("rdata/coanc_os.rda")
load("rdata/Q_hat/7.rda")

coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")

We then compute the corresponding individual-level coancestry under the super admixture and standard admixture.

coanc_sup <- t(Q_hat) %*% coanc_antepops_sup %*% Q_hat
coanc_std <- t(Q_hat) %*% coanc_antepops_std %*% Q_hat

6 Visualization

We can visualize the coancestry of antecedent populations coanc_antepops_sup and admixture proportions Q_hat using the helper functions get_seq_colors, plot_tree, barplot_admix and heatmap_coanc_antepops we provide.

# reorder antecedent populations according to the value of the population inbredding
index        <- order(diag(coanc_antepops_sup))
Q_hat        <- Q_hat[index, ]
k_antepops   <- 7
coanc_antepops_sup <- coanc_antepops_sup[index, index]

# label antecedent populations
colnames(coanc_antepops_sup) <- rownames(coanc_antepops_sup) <- paste0("S", 1:k_antepops)

# fit tree
tree <- bnpsd::fit_tree(round(coanc_antepops_sup, 6))

# plot an uncolorred tree
plot_tree(tree)

Based on the topology of the tree, we decide to color the populations \(S_1\), \(S_2\) by light blue and dark blue, \(S_3\), \(S_4\), \(S_5\) by light green, medium green and dark green, \(S_6\) and \(S_7\) by light red and dark red. We can pick colors by using the get_seq_colors() function and its returned value can be used to specify the coloring scheme for plot_tree() function.

colors <- c(get_seq_colors("Blues",  2),  get_seq_colors("Greens", 3), get_seq_colors("Reds",   2))
names(colors) <- paste0("S", 1:k_antepops)
fig_tree <- plot_tree(tree, colors = colors)
# flip tree 
fig_tree <- ggtree::flip(fig_tree, get_parent_node("S6", tree), get_parent_node("S4", tree))
fig_tree <- ggtree::flip(fig_tree, get_current_node("S4", tree), get_parent_node("S3", tree))

fig_tree <- ggtree::scaleClade(fig_tree, get_parent_node("S6", tree), 2)
fig_tree <- ggtree::scaleClade(fig_tree, get_parent_node("S4", tree), 0.5)
fig_tree

We can visualize admixture proportions Q_hat using the barplot_admix function.

fig_admix <- barplot_admix(Q_hat, colors = colors, subpops = fam$subpop)
fig_admix 

We also can visualize the coancestry among antecedent populations by heatmap_coanc_antepops. We define the helper function heatmap_coanc_antepops_wrapper that returns a ggplot object of the heatmap.

heatmap_coanc_antepops_wrapper <- function(coanc_antepops, tl.cex = 1.3, cl.cex = 1, tl.offset = 0.6) {
  superadmixture::heatmap_coanc_antepops(coanc_antepops, tl.cex = tl.cex, cl.cex = cl.cex, tl.offset = tl.offset)
  grab_grob <- function(){
    gridGraphics::grid.echo()
    grid::grid.grab()
  }
  p <- grab_grob()
 
 # save correlation matrix colors to a vector, then make coloured matrix grob transparent
 matrix.colors <- grid::getGrob(p, grid::gPath("square"), grep = TRUE)[["gp"]][["fill"]]
 p <- grid::editGrob(p, grid::gPath("square"), grep = TRUE, gp = grid::gpar(col = NA, fill = NA))

 # apply the saved colours to the underlying matrix grob
 p <- grid::editGrob(p, grid::gPath("symbols-rect-1"), grep = TRUE, gp = grid::gpar(fill = matrix.colors))

 # convert the background fill from white to transparent, while we are at it
 p <- grid::editGrob(p, grid::gPath("background"), grep = TRUE, gp = grid::gpar(fill = NA))
 p <- ggplotify::as.ggplot(p) + ggplot2::theme(plot.margin = unit(c(0, 0, 0, 0), "pt"))
 return(p)
}
par(xpd = TRUE)
fig_coancestry <- heatmap_coanc_antepops_wrapper(coanc_antepops_sup, tl.cex = 1.1, tl.offset = 1)
fig_coancestry <- fig_coancestry + theme(plot.margin = margin(0, 0, 0, 0, "pt"))
coancestry_lab <- ggplot() + 
  annotate("text", x = 0.5, y = 0.5, size = 5, label = "Coancestry") + 
  xlim(0, 1) + 
  ylim(0, 1) + 
  theme_void()

fig_coancestry <- ggarrange(fig_coancestry, coancestry_lab, ncol = 1, heights = c(1, 0.1))
fig_coancestry

We combine all plots.

# combine plots of tree, admix props, subpops and coancestry
design <- "
  112
  113
  ##3
"

fig_coancestry + fig_tree + fig_admix + 
  patchwork::plot_layout(
            design = design,
            widths = c(0.6, 0.2, 1.4), 
            heights = c(0.6, 0.2, 0.4)) +
  patchwork::plot_annotation(tag_levels = 'A', tag_prefix = '(', tag_suffix = ')') &
  ggplot2::theme(plot.tag = ggplot2::element_text(color = "black", size = 21, face = 'bold'))

7 Simulating genotypes from the structure of IND study

We can simulate genotypes using the double-admixture method with the dbl_admixture function. It takes about an hour to run the following codes, so we’re not running here.

X <- as.matrix(X)
p_anc <- 0.5 * colMeans(X, na.rm = TRUE)
coanc_antepops <- round(coanc_antepops, 6)
X_sim <- dbl_admixture(p_anc, coanc_antepops, Q_hat, geno_only = TRUE)

## estimate kinship
obj         <- popkin::popkin_A(X_sim)
A_min   <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os  <- 1 - obj$A / A_min

## mapping kinship coefficients to coancestry coefficients
coanc_sim_os  <- popkin::inbr_diag(kinship_sim_os)
coanc_sim_os  <- ifelse(coanc_sim_os < 0, 0, coanc_sim_os)
save(coanc_sim_os,  file = "rdata/coanc_sim_os.rda")

We can also use NORTA method to simulate genotypes. It takes a few hours to run the following codes, so we’re not running here.

X_sim_norta <- norta_approx(p_anc, coanc_antepops, Q_hat,
              geno_only = TRUE, parallel = TRUE, method = "numeric", tol = 1e-5, mc_cores = 20)

## estimate kinship
obj         <- popkin::popkin_A(X_sim_norta)
A_min   <- mean(sort(obj$A[lower.tri(obj$A)])[1:100])
kinship_sim_os_norta  <- 1 - obj$A / A_min

## mapping kinship coefficients to coancestry coefficients
coanc_sim_os_norta    <- popkin::inbr_diag(kinship_sim_os_norta)
coanc_sim_os_norta    <- ifelse(coanc_sim_os_norta < 0, 0, coanc_sim_os_norta)
save(coanc_sim_os_norta,   file = "rdata/coanc_sim_os_norta.rda")

We plot the individual-level coancestry.

load("rdata/coanc_sim_os.rda")
load("rdata/coanc_sim_os_norta.rda")

par(mar = c(0, 0, 0, 0) + 0.2)
layout(rbind(
   c(14, 19, 15, 20, 16, 21,  0),
   c( 7,  1,  0,  2,  0,  3,  6),
   c( 0,  9,  0, 10,  0, 11,  0),
   c(17, 22, 18, 23,  0,  0,  0),
   c( 8,  4,  0,  5,  0,  0,  0),
   c( 0, 12,  0, 13,  0,  0,  0)),
  heights = c(0.2, 1, 0.22, 0.2, 1, 0.22),
  widths  = c(0.2, 1, 0.08, 1, 0.08, 1, 0.2))

coanc_sim_os       <- (coanc_sim_os - 1) * (1 - min(coanc_sup)) + 1
coanc_sim_os_norta <- (coanc_sim_os_norta - 1) * (1 - min(coanc_sup)) + 1
# We truncate the large entries of `coanc_std`
# coanc_std  <- ifelse(coanc_std > max(coanc_os), max(coanc_os), coanc_std)

popkin::plot_popkin(kinship = list(coanc_os, coanc_sup, coanc_std, coanc_sim_os, coanc_sim_os_norta),
                    layout_add = FALSE,
                    leg_cex = 0.8,
                    labs_text = FALSE,
                    labs_lwd = 0.1,
                    labs = fam$subpop,
                    ylab = '',
                    leg_title = "Coancestry",
                    panel_letters = NULL)

par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)

par(mar = c(0.5, 0.5, 0.2, 0))
plot_colors_subpops(fam$subpop, y = TRUE, cex = 1.1)
mtext('Individuals', side = 2, line = 0.5, xpd = NA, cex = 0.8)

par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)
par(mar = c(0, 0.5, 0, 0.2))
plot_colors_subpops(fam$subpop, srt = 90, cex = 1.1)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(A)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(B)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(C)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(D)", cex = 2, xpd = TRUE, font = 2)

par(mar = c(0, 0, 0, 0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.3, "(E)", cex = 2, xpd = TRUE, font = 2)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof real data", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Super admixture individual coancestry\n of real data", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "Standard admixture individual coancestry\n of real data", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (double-admixture)", cex = 1.6)

plot(c(0, 1), c(0, 1), ann = F, bty = 'n', type = 'n', xaxt = 'n', yaxt = 'n')
text(0.5, 0.4, "OS individual coancestry\nof bootstrap data (NORTA)", cex = 1.6)

8 Confirming significant hypothesis tests of standard admixture versus super admixture

We perform the significance test of coancestry among antecedent populations to compare the model fit between standard admixture versus super admixture. The following chunk of codes is used to generate the null distribution of the test statistics \(U\). It should be noted that it takes days to run this chunk of code. We submit each iteration as a separate cluster job to speed up (not show here). Therefore, we’re just showing the code but not running it. We provide the pre-computed null test statistics in the file rdata/test_stats0.rda.

for (task_id in 1:1000) {
  ## compute null test statistics
  inbr_antepops <- diag(est_coanc(coanc_os, Q_hat, model = "standard"))

  ## compute null
  test_stats0 <- compute_null(p_anc, Q_hat, inbr_antepops, verbose = TRUE)

  # save results
  dir.create(file.path("rdata", "test_stat0"), showWarnings = FALSE)
  save(test_stat0, file = paste0("rdata/test_stat0/", task_id, ".rda"))
}

test_stats0 <- c()
for (task_id in 1:1000) {
  load(paste0("rdata/test_stat0/", task_id, ".rda"))
  test_stats0 <- c(test_stats0, test_stat0)
}
save(test_stats0, file = "rdata/test_stats0.rda")

We plot the distribution of null test statistics and label our observed test statistics.

load("rdata/test_stats0.rda")
load("rdata/coanc_os.rda")
load("rdata/Q_hat/7.rda")

coanc_antepops_sup <- est_coanc(coanc_os, Q_hat, model = "super")
coanc_antepops_std <- est_coanc(coanc_os, Q_hat, model = "standard")

test_stat1 <- norm(coanc_antepops_sup - coanc_antepops_std, "F")
p <- data.frame(test_stats0 = test_stats0) %>%
     ggplot(data, mapping = aes(x = test_stats0)) +
        geom_histogram(aes(y =..density..), 
                 fill  = "dodgerblue3", 
                 bins  = 10,
                 binwidth = NULL,
                 color = "black", 
                 alpha = 0.7, 
                 size  = 0.1) +
    labs(x = latex2exp::TeX(r"(Null test-statistics)"), y='Density') +
    scale_y_continuous(limits = c(0, 250)) +
    theme_bw(base_size = 16) +
    ggtitle("Indian study (IND)") +
    theme(panel.grid.major = element_blank(), 
          panel.grid.minor = element_blank(), 
    plot.title = element_text(hjust = 0.5, size=16)) +
    annotate("text", x = 0.051, y = 240, size = 6, label = latex2exp::TeX(sprintf("$U_{obs}$ = %.3f", test_stat1)))
p